gusucode.com > 《MATLAB图像与视频处理实用案例详解》代码 > 《MATLAB图像与视频处理实用案例详解》代码/第 19 章 基于语音识别的信号灯图像模拟控制技术/voicebox/specsub.m

    function [ss,zo]=specsub(si,fsz,pp)
%SPECSUB performs speech enhancement using spectral subtraction [SS,ZO]=(S,FSZ,P)
%
% Inputs:
%   si      input speech signal
%   fsz     sample frequency in Hz
%           Alternatively, the input state from a previous call (see below)
%   pp      algorithm parameters [optional]
%
% Outputs:
%   ss      output enhanced speech (length is rounded down to the nearest frame boundary)
%   zo      output state
%
% The algorithm operation is controlled by a small number of parameters:
%
%        pp.of          % overlap factor = (fft length)/(frame increment) [2]
%        pp.ti          % desired frame increment [0.016 seconds]
%        pp.ri          % set to 1 to round ti to the nearest power of 2 samples [0]
%        pp.g           % subtraction domain: 1=magnitude, 2=power [1]
%        pp.e           % gain exponent [1]
%        pp.am          % max oversubtraction factor [3]
%        pp.b           % max noise attenutaion in power domain [0.01]
%        pp.al          % SNR for oversubtraction=am (set this to Inf for fixed a) [-5 dB]
%        pp.ah          % SNR for oversubtraction=1 [20 dB]
%        pp.bt          % threshold for binary gain or -1 for continuous gain [-1]
%        pp.mx          % input mixture gain [0]
%        pp.gh          % maximum gain for noise floor [1]
%
% Following [1], the magnitude-domain gain in each time-frequency bin is given by
%                          gain=mx+(1-mx)*max((1-(a*N/X)^(g/2))^(e/g),min(gh,(b*N/X)^(e/2)))
% where N and X are the powers of the noise and noisy speech respectively.
% The oversubtraction factor varies linearly between a=am for a frame SNR of al down to
% a=1 for a frame SNR of ah. To obtain a fixed value of a for all values of SNR, set al=Inf.
% Common exponent combinations are:
%                      g=1  e=1    Magnitude Domain spectral subtraction
%                      g=2  e=1    Power Domain spectral subtraction
%                      g=2  e=2    Wiener filtering
% Many authors use the parameters alpha=a^(g/2), beta=b^(g/2) and gamma2=e/g instead of a, b and e
% but this increases interdependence amongst the parameters.
% If bt>=0 then the max(...) expression above is thresholded to become 0 or 1.
%
% In addition it is possible to specify parameters for the noise estimation algorithm
% which implements reference [2] from which equation numbers are given in parentheses.
% They are as follows:
%
%        pp.taca      % (11): smoothing time constant for alpha_c [0.0449 seconds]
%        pp.tamax     % (3): max smoothing time constant [0.392 seconds]
%        pp.taminh    % (3): min smoothing time constant (upper limit) [0.0133 seconds]
%        pp.tpfall    % (12): time constant for P to fall [0.064 seconds]
%        pp.tbmax     % (20): max smoothing time constant [0.0717 seconds]
%        pp.qeqmin    % (23): minimum value of Qeq [2]
%        pp.qeqmax    % max value of Qeq per frame [14]
%        pp.av        % (23)+13 lines: fudge factor for bc calculation  [2.12]
%        pp.td        % time to take minimum over [1.536 seconds]
%        pp.nu        % number of subwindows to use [3]
%        pp.qith      % Q-inverse thresholds to select maximum noise slope [0.03 0.05 0.06 Inf ]
%        pp.nsmdb     % corresponding noise slope thresholds in dB/second   [47 31.4 15.7 4.1]
%
%
% If convenient, you can call specsub in chunks of arbitrary size. Thus the following are equivalent:
%
%                   (a) y=specsub(s,fs);
%
%                   (b) [y1,z]=specsub(s(1:1000),fs);
%                       [y2,z]=specsub(s(1001:2000),z);
%                       y3=specsub(s(2001:end),z);
%                       y=[y1; y2; y3];
%
% Note that the number of output samples will be less than the number of input samples if
% the input length is not an exact number of frames. In addition, if two output arguments
% are specified, the last partial frame samples will be retained for overlap adding
% with the output from the next call to specsub().
%
% See also ssubmmse() for an alternative gain function
%
% Refs:
%    [1] M. Berouti, R. Schwartz and J. Makhoul
%        Enhancement of speech corrupted by acoustic noise
%        Proc IEEE ICASSP, 1979, 4, 208-211
%    [2] Rainer Martin.
%        Noise power spectral density estimation based on optimal smoothing and minimum statistics.
%        IEEE Trans. Speech and Audio Processing, 9(5):504-512, July 2001.

%      Copyright (C) Mike Brookes 2004
%      Version: $Id: specsub.m,v 1.4 2010/11/12 14:30:44 dmb Exp $
%
%   VOICEBOX is a MATLAB toolbox for speech processing.
%   Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%   This program is free software; you can redistribute it and/or modify
%   it under the terms of the GNU General Public License as published by
%   the Free Software Foundation; either version 2 of the License, or
%   (at your option) any later version.
%
%   This program is distributed in the hope that it will be useful,
%   but WITHOUT ANY WARRANTY; without even the implied warranty of
%   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
%   GNU General Public License for more details.
%
%   You can obtain a copy of the GNU General Public License from
%   http://www.gnu.org/copyleft/gpl.html or by writing to
%   Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
if isstruct(fsz)
    fs=fsz.fs;
    qq=fsz.qq;
    qp=fsz.qp;
    ze=fsz.ze;
    s=zeros(length(fsz.si)+length(si(:)),1); % allocate space for speech
    s(1:length(fsz.si))=fsz.si;
    s(length(fsz.si)+1:end)=si(:);
else
    fs=fsz;     % sample frequency
    s=si(:);
    % default algorithm constants

    qq.of=2;   % overlap factor = (fft length)/(frame increment)
    qq.ti=16e-3;   % desired frame increment (16 ms)
    qq.ri=0;       % round ni to the nearest power of 2
    qq.g=1;        % subtraction domain: 1=magnitude, 2=power
    qq.e=1;        % gain exponent
    qq.am=3;      % max oversubtraction factor
    qq.b=0.01;      % noise floor
    qq.al=-5;       % SNR for maximum a (set to Inf for fixed a)
    qq.ah=20;       % SNR for minimum a
    qq.bt=-1;       % suppress binary masking
    qq.mx=0;        % no input mixing
    qq.gh=1;        % maximum gain
    if nargin>=3 && ~isempty(pp)
        qp=pp;      % save for estnoisem call
        qqn=fieldnames(qq);
        for i=1:length(qqn)
            if isfield(pp,qqn{i})
                qq.(qqn{i})=pp.(qqn{i});
            end
        end
    else
        qp=struct;  % make an empty structure
    end
end
% derived algorithm constants
if qq.ri
    ni=pow2(nextpow2(qq.ti*fs*sqrt(0.5)));
else
    ni=round(qq.ti*fs);    % frame increment in samples
end
tinc=ni/fs;          % true frame increment time

% calculate power spectrum in frames

no=round(qq.of);                                   % integer overlap factor
nf=ni*no;           % fft length
w=sqrt(hamming(nf+1))'; w(end)=[]; % for now always use sqrt hamming window
w=w/sqrt(sum(w(1:ni:nf).^2));       % normalize to give overall gain of 1
y=enframe(s,w,ni);
yf=rfft(y,nf,2);
yp=yf.*conj(yf);        % power spectrum of input speech
[nr,nf2]=size(yp);              % number of frames
if isstruct(fsz)
    [dp,ze]=estnoisem(yp,ze);   % estimate the noise using minimum statistics
    ssv=fsz.ssv;
else
    [dp,ze]=estnoisem(yp,tinc,qp);   % estimate the noise using minimum statistics
    ssv=zeros(ni*(no-1),1);             % dummy saved overlap
end
if ~nr                                  % no data frames
    ss=[];
else

    mz=yp==0;   %  mask for zero power time-frequency bins (unlikely)
    if qq.al<Inf
        ypf=sum(yp,2);
        dpf=sum(dp,2);
        mzf=dpf==0;     % zero noise frames = very high SNR
        af=1+(qq.am-1)*(min(max(10*log10(ypf./(dpf+mzf)),qq.al),qq.ah)-qq.ah)/(qq.al-qq.ah);
        af(mzf)=1;      % fix the zero noise frames
    else
        af=repmat(qq.am,nr,1);
    end
    switch qq.g
        case 1   % magnitude domain subtraction
            v=sqrt(dp./(yp+mz));
            af=sqrt(af);
            bf=sqrt(qq.b);
        case 2   % power domain subtraction
            v=dp./(yp+mz);
            bf=qq.b;
        otherwise % arbitrary subtraction domain
            v=(dp./(yp+mz)).^(0.5*qq.g);
            af=af.^(0.5*qq.g);
            bf=qq.b^(0.5*qq.g);
    end
    af =repmat(af,1,nf2);       % replicate frame oversubtraction factors for each frequency
    mf=v>=(af+bf).^(-1);        % mask for noise floor limiting
    g=zeros(size(v));           % reserve space for gain matrix
    eg=qq.e/qq.g;               % gain exponent relative to subtraction domain
    gh=qq.gh;
    switch eg
        case 1                          % Normal case
            g(mf)=min(bf*v(mf),gh);      % never give a gain > 1
            g(~mf)=1-af(~mf).*v(~mf);
        case 0.5                        
            g(mf)=min(sqrt(bf*v(mf)),gh);
            g(~mf)=sqrt(1-af(~mf).*v(~mf));
        otherwise
            g(mf)=min((bf*v(mf)).^eg,gh);
            g(~mf)=(1-af(~mf).*v(~mf)).^eg;
    end
    if qq.bt>=0
        g=g>qq.bt;
    end
    g=qq.mx+(1-qq.mx)*g;   % mix in some of the input
    se=(irfft((yf.*g).',nf).').*repmat(w,nr,1);   % inverse dft and apply output window
    ss=zeros(ni*(nr+no-1),no);                      % space for overlapped output speech
    ss(1:ni*(no-1),end)=ssv;
    for i=1:no
        nm=nf*(1+floor((nr-i)/no));  % number of samples in this set
        ss(1+(i-1)*ni:nm+(i-1)*ni,i)=reshape(se(i:no:nr,:)',nm,1);
    end
    ss=sum(ss,2);

end
if nargout>1
    if nr
        zo.ssv=ss(end-ni*(no-1)+1:end);    % save the output tail for next time
        ss(end-ni*(no-1)+1:end)=[];
    else
        zo.ssv=ssv;  %
    end
    zo.si=s(length(ss)+1:end);      % save the tail end of the input speech signal
    zo.fs=fs;                       % save sample frequency
    zo.qq=qq;                       % save loval parameters
    zo.qp=qp;                       % save estnoisem parameters
    zo.ze=ze;                       % save state of noise estimation
end
if ~nargout && nr>0
    ttax=(1:nr)*tinc;
    ffax=(0:size(g,2)-1)*fs/nf/1000;    ax=zeros(4,1);
    ax(1)=subplot(223);
    imagesc(ttax,ffax,20*log10(g)');
    colorbar;
    axis('xy');
    if qq.al==Inf
        title(sprintf('Filter Gain (dB): a=%.2g, b=%.3g',qq.am,qq.b));
    else
        title(sprintf('Filter Gain (dB): a=%.2g (%.0f to %.0fdB), b=%.3g',qq.am,qq.al,qq.ah,qq.b));
    end
    xlabel('Time (s)');
    ylabel('Frequency (kHz)');

    ax(2)=subplot(222);
    imagesc(ttax,ffax,10*log10(yp)');
    colorbar;
    axis('xy');
    title('Noisy Speech (dB)');
    xlabel('Time (s)');
    ylabel('Frequency (kHz)');

    ax(3)=subplot(224);
    imagesc(ttax,ffax,10*log10(yp.*g.^2)');
    colorbar;
    axis('xy');
    title(sprintf('Enhanced Speech (dB): g=%.2g, e=%.3g',qq.g,qq.e));
    xlabel('Time (s)');
    ylabel('Frequency (kHz)');

    ax(4)=subplot(221);
    imagesc(ttax,ffax,10*log10(dp)');
    colorbar;
    axis('xy');
    title('Noise Estimate (dB)');
    xlabel('Time (s)');
    ylabel('Frequency (kHz)');
    linkaxes(ax);
end